Linear Regression

  • Linear regression applied to the stock market
  • Create a model that predicts the closing price of a stock
In [27]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.offline as py
import plotly
plotly.offline.init_notebook_mode()
import datetime
In [2]:
# Load the historical quotes from the CSV file next to the notebook
dataset = pd.read_csv(filepath_or_buffer='petr4.csv')
In [3]:
# Convert the Date column from its CSV representation to datetime values
dataset['Date'] = dataset['Date'].pipe(pd.to_datetime)
In [4]:
# Show the last five rows of the frame
dataset.tail(n=5)
Out[4]:
Date Open High Low Close Volume
1797 2010-01-08 37.16 37.39 36.86 36.95 14624200
1798 2010-01-07 37.27 37.45 37.07 37.15 10964600
1799 2010-01-06 36.80 37.50 36.80 37.50 18720600
1800 2010-01-05 37.38 37.43 36.80 37.00 21396400
1801 2010-01-04 36.95 37.32 36.82 37.32 13303600
In [5]:
# Add the per-row variation: closing price minus opening price
dataset['Variation'] = dataset['Close'] - dataset['Open']
In [6]:
# Show the first five rows, now including the Variation column
dataset.head(n=5)
Out[6]:
Date Open High Low Close Volume Variation
0 2017-04-11 14.97 14.99 14.55 14.68 38392300 -0.29
1 2017-04-10 14.90 14.94 14.70 14.94 37541700 0.04
2 2017-04-07 14.61 14.90 14.60 14.70 32944900 0.09
3 2017-04-06 14.62 14.87 14.42 14.53 34386000 -0.09
4 2017-04-05 15.05 15.16 14.50 14.57 49623400 -0.48

Data Visualization

Plots stock price in the period

In [7]:
# Plot the closing price over the analyzed range (January 2010 to April 2017)
x1 = dataset.Date
y1 = dataset.Close
data = [go.Scatter(x=x1, y=y1)]
layout = go.Layout(
    xaxis=dict(
        # ISO 8601 (YYYY-MM-DD) strings are unambiguous on a date axis;
        # the previous day-first strings could be misread as month-first.
        range=['2010-01-01', '2017-04-11'],
        title='Year'
    ),
    yaxis=dict(
        # Both bounds must come from the price series. The lower bound
        # used to be min(x1), i.e. a date, which is not a valid price limit.
        range=[y1.min(), y1.max()],
        title='Stock Price'
    ))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

Candlesticks of last 7 days

In [8]:
# Candlestick chart of the first 7 rows of the dataset
# (the frame is ordered newest first, so these are the most recent sessions)
dataset2 = dataset.head(7)
dados = go.Candlestick(x=dataset2.Date,
                       open=dataset2.Open,
                       high=dataset2.High,
                       low=dataset2.Low,
                       close=dataset2.Close,
                       )

data = [dados]
# `py` is already plotly.offline, so call iplot on it directly (as the
# price plot does) instead of reaching into the inner `offline` submodule.
py.iplot(data, filename='graph_candlestick')

Variation in the period

In [9]:
# Variation in the period
%matplotlib notebook
import matplotlib.dates as mdates
import datetime as dt
x = dataset['Date']
y = dataset['Variation']
plt.plot_date(x,y, color='r',fmt="r-")
plt.xticks(rotation=30)
plt.show()

Correlating features and target

In [10]:
# `training` is another name for the same DataFrame object as `dataset` (no copy is made)
training = dataset

Open and Close prices correlation

In [11]:
# Scatter plot of Open and Close prices from last 100 days
%matplotlib notebook
x = training.Open[:100]
y = training.Close[:100]
plt.scatter(x,y,color='b')
plt.xlabel('open price')
plt.ylabel('close price')
plt.axis([min(x),max(x),min(y),max(y)])
plt.autoscale('False')
plt.show()

High and Close prices correlation

In [12]:
# Scatter plot of High and Close prices from last 100 days
%matplotlib notebook
x = training.High[:100]
y = training.Close[:100]
plt.scatter(x,y,color='b')
plt.xlabel('high price')
plt.ylabel('close price')
plt.axis([min(x),max(x),min(y),max(y)])
plt.autoscale('False')
plt.show()

Low and Close prices correlation

In [13]:
# Scatter plot of Low and Close price from last 100 days 
%matplotlib notebook
x = training.Low[:100]
y = training.Close[:100]
plt.scatter(x,y,color='b')
plt.xlabel('low price')
plt.ylabel('close price')
plt.axis([min(x),max(x),min(y),max(y)])
plt.autoscale('False')
plt.show()

Volume and Close prices correlation

In [14]:
# Scatter plot of Volume and Close price from last 100 days
%matplotlib notebook
x = training.Volume[:100]
y = training.Close[:100]
plt.scatter(x,y,color='b')
plt.xlabel('Volume')
plt.ylabel('close price')
plt.axis([min(x),max(x),min(y),max(y)])
plt.ticklabel_format(style='plain', axis='x')
plt.autoscale('False')
plt.xticks(rotation=45)
plt.show()
In [15]:
# Columns used as model inputs; keep only these in the training frame
features = ['Open', 'High', 'Low', 'Volume']
training = training.loc[:, features]
In [16]:
# Preview the first five rows of the feature frame
training.head(n=5)
Out[16]:
Open High Low Volume
0 14.97 14.99 14.55 38392300
1 14.90 14.94 14.70 37541700
2 14.61 14.90 14.60 32944900
3 14.62 14.87 14.42 34386000
4 15.05 15.16 14.50 49623400
In [17]:
# Target vector: the Close column of the full dataset
y = dataset.loc[:, 'Close']

Training the linear regression model

In [18]:
# Split features and target into training and test parts.
# No test_size is given, so scikit-learn's default proportion applies;
# the fixed random_state makes the shuffle reproducible.
(X_training,
 X_test,
 y_training,
 y_test) = train_test_split(training, y, random_state=42)
In [19]:
# Create the linear regression model with scikit-learn's default settings
lr_model = LinearRegression()
In [20]:
# Fit the model on the training split (features X_training, target y_training)
lr_model.fit(X_training,y_training)
Out[20]:
LinearRegression()
In [22]:
# Inspect the fitted coefficients, one per feature column
# (the training frame was built from ['Open','High','Low','Volume'])
lr_model.coef_
Out[22]:
array([-6.88569832e-01,  7.56540315e-01,  9.33134404e-01,  7.75093093e-10])
In [25]:
# Predict closing prices for the held-out test features
results = lr_model.predict(X_test)
In [24]:
import sklearn.metrics as metrics
def regression_results(y_true, y_pred):
    """Print a summary of regression metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.

    Returns
    -------
    None
        The metrics are printed, each rounded to 4 decimal places:
        explained variance, mean squared log error, R^2, MAE, MSE and RMSE.
    """
    # Regression metrics
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    # scikit-learn raises ValueError when the log error is undefined for the
    # given values (e.g. negative predictions, which a linear model can
    # produce). Report that instead of aborting the whole summary.
    try:
        msle = round(metrics.mean_squared_log_error(y_true, y_pred), 4)
    except ValueError:
        msle = 'n/a (undefined for these values)'

    print('explained_variance: ', round(explained_variance,4))
    print('mean_squared_log_error: ', msle)
    print('r2: ', round(r2,4))
    print('MAE: ', round(mae,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))
In [28]:
# Print the regression metrics comparing the test targets with the predictions
regression_results(y_test,results)
explained_variance:  0.9996
mean_squared_log_error:  0.0001
r2:  0.9996
MAE:  0.1224
MSE:  0.0245
RMSE:  0.1564
In [33]:
%matplotlib notebook

prediction = pd.DataFrame(lr_model.predict(X_test))
actual = pd.DataFrame(y_test.values)

# Graphic style
plt.style.use("ggplot")

# Axis titles
plt.xlabel('Prices')
plt.ylabel('Indexes')
plt.title('Actual Prices vs Prediction')

# Sort values and plot lines
plt.plot(prediction.sort_values(by=0),prediction.index)
plt.plot(actual.sort_values(by=0),actual.index)

# Set graph labels
plt.legend(['Prediction','Actual Price'])
Out[33]:
<matplotlib.legend.Legend at 0x190dc668c18>